# -*- coding: utf-8 -*-
"""
Created on Mon Jun 12 10:50:23 2023

@author: avery
"""

import re  # for regular expressions
import os  # to look up operating system-based info
import string  # to do fancy things with strings
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score



# Columns to load from every CSV; extend this list to pull in more fields.
col_list = ["text"]

# Read the four sources that make up the 'not_recipe' class.
df1 = pd.read_csv(r'FILEPATH', usecols=col_list)
df2 = pd.read_csv(r'FILEPATH', usecols=col_list)
df3 = pd.read_csv(r'FILEPATH', usecols=col_list)
df4 = pd.read_csv(r'FILEPATH', usecols=col_list)

# Stack them into one frame and label every row as 'not_recipe'.
df5 = pd.concat([df1, df2, df3, df4]).assign(label='not_recipe')

# Read the single source for the 'recipe' class and label its rows.
df6 = pd.read_csv(r'FILEPATH', usecols=col_list).assign(label='recipe')

# Combine both classes; ignore_index gives the result a fresh 0..n-1 index.
frames = [df5, df6]
data = pd.concat(frames, ignore_index=True)

# Translation table that deletes every ASCII punctuation character. Built once
# at import time so clean_text does not rebuild it on each call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    '''
    Tokenize a single text into lower-cased, purely alphabetic words.

    The text is split on whitespace, each token is lower-cased and has its
    ASCII punctuation (string.punctuation) removed. Tokens that end up empty
    or that still contain a non-letter character (e.g. a digit) are dropped.

    Parameters:
        text: str, the raw text of one document
    Return: list of str, the cleaned tokens in their original order
    '''
    stripped = (
        word.lower().translate(_PUNCTUATION_TABLE) for word in text.split()
    )
    # str.isalpha() is False for '' as well as for anything containing a
    # digit, so this one filter removes both punctuation-only tokens and
    # tokens that include numbers.
    return [token for token in stripped if token.isalpha()]

# Replace missing texts with a single space so clean_text always gets a str.
data['text'] = data['text'].fillna(' ')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
train, test = train_test_split(data, test_size=0.3, random_state=42)


def _to_tagged_document(row):
    """Wrap one DataFrame row as a TaggedDocument tagged with its class label."""
    return TaggedDocument(words=clean_text(row['text']), tags=[row.label])


train_set = train.apply(_to_tagged_document, axis=1)
test_set = test.apply(_to_tagged_document, axis=1)

model = Doc2Vec(
    vector_size=100,
    window=5,
    alpha=.025,
    min_alpha=0.00025,
    min_count=2,
    dm=1,
    workers=8,
)

# Shuffle the training documents and use that same list for both the
# vocabulary scan and training (the shuffled list used to be computed and
# then ignored in favour of the unshuffled Series).
train_documents = utils.shuffle(list(train_set.values))
model.build_vocab(train_documents)

# Train with a single call. Calling train() once per epoch with epochs=1
# restarts the alpha -> min_alpha learning-rate decay on every call; one call
# with epochs=100 lets the rate decay once across the whole run, as the gensim
# documentation recommends. Per-epoch print statements are therefore gone;
# gensim reports training progress through the standard logging module.
model.train(train_documents,
            total_examples=model.corpus_count,
            epochs=100)

model.save('doc2vec_recipes.model')

def vector_for_learning(model, tagged_documents):
    '''
    Build classifier inputs from tagged documents.

    For every document, takes its first tag as the target and asks the model
    to infer a vector from the document's words.

    Parameters:
        model: object exposing infer_vector(words), e.g. a trained Doc2Vec
        tagged_documents: object whose .values is a sequence of documents
            with .words and .tags attributes (e.g. a pandas Series of
            TaggedDocument)
    Return: (targets, vectors), two tuples of equal length in document
        order; both are empty tuples when there are no documents
    '''
    pairs = [
        (doc.tags[0], model.infer_vector(doc.words))
        for doc in tagged_documents.values
    ]
    # zip(*[]) yields nothing, so unpacking it into two names would raise
    # ValueError; return an explicit empty result instead.
    if not pairs:
        return (), ()
    targets, vectors = zip(*pairs)
    return targets, vectors


# Turn each split into (labels, inferred vectors), training split first.
(y_train, X_train), (y_test, X_test) = (
    vector_for_learning(model, split) for split in (train_set, test_set)
)

# Fit a logistic-regression classifier on the training vectors
# (fit() returns the estimator itself, so the calls can be chained).
logreg = LogisticRegression(C=1e5, n_jobs=1).fit(X_train, y_train)

# Predict labels for the held-out vectors and report plain accuracy.
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print('Accuracy score is: %s' % accuracy)
